import numpy as np
import numexpr as ne
import numpy.random as npr
# First, a simple function to subtract a 'gradient' from parameters
def numpy_sgd_update(param, grad):
    param -= grad
    return param
def numexpr_sgd_update(param, grad):
    ne.evaluate('param - grad', out=param)
    return param
# Check that the functions work; the output is the same, so we're good
param = npr.randn(3)
grad = npr.randn(3)
print(numpy_sgd_update(param.copy(), grad))
print(numexpr_sgd_update(param.copy(), grad))
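A quicker programmatic check of the same thing, using np.allclose rather than eyeballing the printouts:
assert np.allclose(numpy_sgd_update(param.copy(), grad),
                   numexpr_sgd_update(param.copy(), grad))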
#First, use vectors the same size as a 4096*4096 fully-connected layer as used in VGG-19
np_param = npr.randn(4096*4096).astype(np.float32)
ne_param = np_param.copy()
grad = 0.01 * npr.randn(4096*4096).astype(np.float32)
%%timeit
numpy_sgd_update(np_param, grad)
%%timeit
numexpr_sgd_update(ne_param, grad)
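numexpr evaluates the expression in blocks and spreads them over several threads, which is likely where most of the gain on a single-pass operation like this comes from. The thread count can be pinned explicitly if you want to experiment (set_num_threads returns the previous setting); a sketch that just restores the default afterwards:
old_nthreads = ne.set_num_threads(4)  # e.g. try 4 threads
ne.set_num_threads(old_nthreads)      # restore the previous setting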
# Next, a smaller vector approximately the size of a 3x3x3x64 convolution filter
np_param = npr.randn(3*3*3*64).astype(np.float32)
ne_param = np_param.copy()
grad = 0.01 * npr.randn(3*3*3*64).astype(np.float32)
%%timeit
numpy_sgd_update(np_param, grad)
%%timeit
numexpr_sgd_update(ne_param, grad)
def numpy_adam_update(param, grad, m, v):
    # exponential moving averages of the gradient and the squared gradient
    m += (1 - 0.9) * (grad - m)
    v += (1 - 0.999) * (grad * grad - v)
    param -= 0.001 * m / (np.sqrt(v) + 1e-8)
    return param, m, v
def numexpr_adam_update(param, grad, m, v):
    ne.evaluate('m + (1 - 0.9) * (grad - m)', out=m, casting='same_kind')
    ne.evaluate('v + (1 - 0.999) * (grad * grad - v)', out=v, casting='same_kind')
    ne.evaluate('param - 0.001 * m / (sqrt(v) + 1e-8)', out=param, casting='same_kind')
    return param, m, v
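Note that these updates bake in beta1=0.9, beta2=0.999 and lr=0.001, and skip Adam's bias correction. For reference, a minimal bias-corrected numpy sketch (a hypothetical helper, assuming the caller tracks the step count t starting at 1) would look like:
def numpy_adam_update_bc(param, grad, m, v, t, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
    # same exponential moving averages as above
    m += (1 - b1) * (grad - m)
    v += (1 - b2) * (grad * grad - v)
    # bias-corrected estimates, as in the original Adam paper
    m_hat = m / (1 - b1 ** t)
    v_hat = v / (1 - b2 ** t)
    param -= lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v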
np_param = npr.randn(3).astype(np.float32)
np_m = npr.randn(3).astype(np.float32)
np_v = npr.rand(3).astype(np.float32)
ne_param = np_param.copy()
ne_m = np_m.copy()
ne_v = np_v.copy()
grad = 0.01 * npr.randn(3).astype(np.float32)
print(numpy_adam_update(np_param, grad, np_m, np_v))
print(numexpr_adam_update(ne_param, grad, ne_m, ne_v))
The functions return the same result, so let's try some timing.
#First, use vectors the same size as a 4096*4096 fully-connected layer as used in VGG-19
np_param = npr.randn(4096*4096).astype(np.float32)
np_m = npr.randn(4096*4096).astype(np.float32)
np_v = npr.rand(4096*4096).astype(np.float32)
ne_param = np_param.copy()
ne_m = np_m.copy()
ne_v = np_v.copy()
grad = 0.01 * npr.randn(4096*4096).astype(np.float32)
%%timeit
numpy_adam_update(np_param, grad, np_m, np_v)
%%timeit
numexpr_adam_update(ne_param, grad, ne_m, ne_v)
# Next, use vectors the same size as a 3x3x3x64 convolutional layer
np_param = npr.randn(3*3*3*64).astype(np.float32)
np_m = npr.randn(3*3*3*64).astype(np.float32)
np_v = npr.rand(3*3*3*64).astype(np.float32)
ne_param = np_param.copy()
ne_m = np_m.copy()
ne_v = np_v.copy()
grad = 0.01 * npr.randn(3*3*3*64).astype(np.float32)
%%timeit
numpy_adam_update(np_param, grad, np_m, np_v)
%%timeit
numexpr_adam_update(ne_param, grad, ne_m, ne_v)
# Make sure these things give the same result
a = np.array([0.5, -0.1, -1.0, 5.0], dtype=np.float32)
print(np.fmax(a, 0))
print(ne.evaluate('where(a>0,a,0)'))
np_parm = npr.randn(4096*4096).astype(np.float32)
ne_parm = np_parm.copy()
%%timeit
np.fmax(np_parm, 0)
%%timeit
ne.evaluate('where(ne_parm>0, ne_parm, 0)')
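Both versions above allocate a fresh output array on every call; if overwriting the input is acceptable, np.fmax and ne.evaluate both take an out= argument. A sketch reusing the arrays from above (note this clobbers them):
np.fmax(np_parm, 0, out=np_parm)  # in-place ReLU with numpy
ne.evaluate('where(ne_parm>0, ne_parm, 0)', out=ne_parm, casting='same_kind')  # in-place ReLU with numexpr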
def opt_eval(np_func, ne_func, *args):
    """
    Crude dispatcher: evaluate the numpy version if the first argument is
    small enough, and the numexpr version once it is large enough to be worth it
    """
    if np.prod(args[0].shape) < 2**10:
        return np_func(*args)
    else:
        return ne_func(*args)
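For example, routing the SGD update from earlier through the wrapper on some illustrative arrays (the 2**10 cutoff above is a guess, not a tuned threshold):
tiny_param = npr.randn(2**4).astype(np.float32)
tiny_grad = 0.01 * npr.randn(2**4).astype(np.float32)
opt_eval(numpy_sgd_update, numexpr_sgd_update, tiny_param, tiny_grad)  # small, so the numpy path runs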
# For this, compute tanh of a ReLU applied to an array
def np_func(data):
    return np.tanh(np.fmax(data, 0))
def ne_func(data):
    return ne.evaluate('tanh(where(data>0, data, 0))')
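A quick check on a small array that the two give the same answer (allclose rather than exact equality, since the tanh implementations may differ in the last bits):
check = npr.randn(16).astype(np.float32)
print(np.allclose(np_func(check), ne_func(check)))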
#smallest array
data0 = npr.randn(2**6).astype(np.float32)
#small array
data1 = npr.randn(2**12).astype(np.float32)
#medium array
data2 = npr.randn(2**18).astype(np.float32)
#large array
data3 = npr.randn(2**24).astype(np.float32)
On the small array, the overhead of calculating the dimensions and evaluating the if-statement appears to be about 6 microseconds per iteration. This is similar to the per-call overhead of numexpr evaluation.
%%timeit
np_func(data0)
%%timeit
ne_func(data0)
%%timeit
opt_eval(np_func, ne_func, data0)
The if-statement and dimension calculation in the optimal evaluation make it a tad slower than calling numexpr directly, but it is still faster than pure numpy.
%%timeit
np_func(data1)
%%timeit
ne_func(data1)
%%timeit
opt_eval(np_func, ne_func, data1)
Here the overhead is markedly less significant and the speedup is more than 10x. An array of this size is a plausible mini-batch in a neural network, which suggests substantial speedups are possible in practice.
%%timeit
np_func(data2)
%%timeit
ne_func(data2)
%%timeit
opt_eval(np_func, ne_func, data2)
The speedup only grows with the size of the array, again exceeding 10x here.
%%timeit
np_func(data3)
%%timeit
ne_func(data3)
%%timeit
opt_eval(np_func, ne_func, data3)